Internet Info 1994 March

home *** CD-ROM | disk | FTP | other *** search

/ Internet Info 1994 March / Internet Info CD-ROM (Walnut Creek) (March 1994).iso / standards / sgml / nist / parse3 / digets.sav < prev next >

Wrap

Text File | 1990-09-13 | 44.9 KB | 1,284 lines

/* National Institute of Standards and Technology (NIST) /* National Computer System Laboratory (NCSL) /* Office Systems Engineering (OSE) Group /* ******************************************************************** /* D I S C L A I M E R /* (March 8, 1989) /* /* There is no warranty for the NIST NCSL OSE SGML parser and/or the NIST /* NCSL OSE SGML parser validation suite. If the SGML parser and/or /* validation suite is modified by someone else and passed on, NIST wants /* the parser's recipients to know that what they have is not what NIST /* distributed, so that any problems introduced by others will not /* reflect on our reputation. /* /* Policies /* /* 1. Anyone may copy and distribute verbatim copies of the SGML source /* code as received in any medium. /* /* 2. Anyone may modify your copy or copies of SGML parser source code or /* any portion of it, and copy and distribute such modifications provided /* that all modifications are clearly associated with the entity that /* performs the modifications. /* /* NO WARRANTY /* =========== /* /* NIST PROVIDES ABSOLUTELY NO WARRANTY. THE SGML PARSER AND VALIDATION /* SUITE ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER /* EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. /* THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS /* WITH YOU. SHOULD THE SGML PARSER OR VALIDATION SUITE PROVE DEFECTIVE, /* YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
/* /* IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL NIST BE LIABLE FOR /* DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR OTHER SPECIAL, /* INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR /* INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA /* BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR A /* FAILURE OF THE PROGRAM TO OPERATE WITH PROGRAMS NOT DISTRIBUTED BY /* NIST) THE PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF /* SUCH DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. */ /************************************************************************/ /* TITLE: SGML PARSER */ /* SYSTEM: DOCUMENT PROCESSOR */ /* SUBSYSTEM: */ /* SOURCE FILE: DIGETS.C */ /* AUTHOR: Steven Lindeman, Fred Maples */ /* */ /* DATE CREATED: */ /* LAST MODIFIED: */ /* */ /* REVISIONS */ /* WHEN WHO WHY */ /************************************************************************/ #include <stdio.h> #include <search.h> #include <ctype.h> #include "didefs.h" #include "diglobal.h" /*------------------------------------------------------*/ /* G E T D E L I M */ /* */ /* Called by: GETTOKEN */ /* */ /* Returns: ETAGO, STAGO, PIO, MDO, */ /* DELIM, or EOF */ /* */ /* reads a delimeter from input */ /*------------------------------------------------------*/ getdelim() { register int inchar,retval; switch(inchar=our_fgetc(indoc)) { case '<': switch(inchar=our_fgetc(indoc)) { case '/': /* found immediately after TAGO, */ if (isalpha(inchar=our_fgetc(indoc))) { our_ungetc(inchar,indoc); putstr_outbuf("\n[/"); /* so it's an endtag */ retval = ETAGO; } else { our_ungetc(inchar,indoc); unget_string("</"); retval = NODELIM; } break; case '?': /* found a processing instruction */ putstr_outbuf("\n[?"); retval = PIO; break; case '!': /* found a markup declaration open */ if ((inchar=our_fgetc(indoc)) == '-') if ((inchar=our_fgetc(indoc)) == '-') { unget_string("--"); putstr_outbuf("<!"); retval = MDO; } else { our_ungetc('-',indoc); 
our_ungetc(inchar,indoc); retval = NODELIM; } else if (isalpha(inchar) || inchar=='[' || inchar==MARKUP_END) { our_ungetc(inchar,indoc); putstr_outbuf("<!"); retval = MDO; } else { our_ungetc(inchar,indoc); retval = NODELIM; } break; default: if (isalpha(inchar)) { our_ungetc(inchar,indoc); putstr_outbuf("\n["); retval = STAGO; } else { our_ungetc(inchar,indoc); our_ungetc('<',indoc); retval = NODELIM; } break; } break; case EOF: retval = EOF; /* no more data */ break; default: our_ungetc(inchar,indoc); retval = NODELIM; /* no delimeter was found, probably data */ break; } return(retval); } /*------------------------------------------------------*/ /* G E T C D A T A */ /* This routine reads character data from 'indoc'. */ /* CDATA is terminated by an etago, delimiter */ /* in context. That is a '</' followed by a name */ /* start character. */ /* */ /* returns -- NFDHT, FOUND */ /*------------------------------------------------------*/ STATUS getcdata() { int inchar; unsigned num_cr; char *outstr; STATUS retval; register BOOLEAN more_cdata,cr_found; BOOLEAN firsttime; retval = NFDHT; outstr = get_char_mem(2); flush_buf(); more_cdata = firsttime = TRUE; num_cr = 0; while(more_cdata==TRUE && (inchar=our_fgetc(indoc))!=EOF) { cr_found = save_crs(&num_cr,&inchar); if (inchar == '<') if ((inchar=our_fgetc(indoc)) == '/') if (isalpha(inchar=our_fgetc(indoc))) { more_cdata = FALSE; our_ungetc(inchar,indoc); /* unget in reverse order */ unget_string("</"); } else { retval = FOUND; check_cr(&num_cr,cr_found,&firsttime,FALSE); (*print_ctr)(ctrfp,"</"); (*applic)(DATA_STG,"</",""); our_ungetc(inchar,indoc); } else { retval = FOUND; check_cr(&num_cr,cr_found,&firsttime,FALSE); (*put_ctr)('<',ctrfp); (*applic)(DATA_STG,"<",""); our_ungetc(inchar,indoc); } else if (inchar == OUR_EE) { if (entstack[--entitylevel] != lookstack()) ourexit(2,"\nError: Entity End occurred in different character data.\n"); } else { retval = FOUND; check_cr(&num_cr,cr_found,&firsttime,FALSE); *outstr 
= inchar; (*applic)(DATA_STG,outstr,""); (*put_ctr)(inchar,ctrfp); } } if (retval == FOUND) (*put_ctr)('|',ctrfp); free(outstr); return(retval); } /*------------------------------------------------------*/ /* G E T P C D A T A */ /* This routine reads parsable character data */ /* from 'indoc'. PCDATA is terminated by an */ /* etago, delimiter in context, given that all */ /* entities have been closed. That is, a '</' */ /* followed by a name start character. General */ /* entities, as well as numeric and named char- */ /* acter references are resolved. */ /* */ /* returns -- NFDHT, FOUND */ /*------------------------------------------------------*/ STATUS getpcdata(genthead,penthead) ENTITYDESC *genthead,*penthead; { int inchar,token,token2; unsigned num_cr; char *outstr; BOOLEAN more_pcdata,more_subdata,cr_found,pcdata_ft; STENTRY *tp; TKNRETVAL tknretval; STATUS retval; TNODE *newcm; flush_buf(); outstr = get_char_mem(2); num_cr = 0; retval = NFDHT; tknretval = TEXT; /* so will read all the data first */ our_ungetc(inchar=our_fgetc(indoc),indoc); /* initialize inchar */ token = -1; tp = NULL; /* just to satisfy lint */ more_pcdata = pcdata_ft = TRUE; while(more_pcdata && inchar!=EOF) { if (open_rcdata_ms) retval = getrcdata(genthead,FALSE,&pcdata_ft,FALSE); else if (open_cdata_ms) retval = get_cdata_ms(&pcdata_ft); else if (tknretval == TEXT) { more_subdata = TRUE; while(more_subdata && (inchar=our_fgetc(indoc))!=EOF) { cr_found = save_crs(&num_cr,&inchar); if (inchar == EOF) more_subdata = pcdata_ft = FALSE; else if (inchar == ']') if ((inchar=our_fgetc(indoc)) == ']') if ((inchar=our_fgetc(indoc)) == MARKUP_END) { if (--num_open_ms < 0) ourexit(2,"\nError: Marked section end outside of declaration.\n"); } else { retval = FOUND; check_cr(&num_cr,cr_found,&pcdata_ft,FALSE); our_ungetc(inchar,indoc); (*applic)(DATA_STG,"]]",""); (*print_ctr)(ctrfp,"]]"); } else { retval = FOUND; check_cr(&num_cr,cr_found,&pcdata_ft,FALSE); our_ungetc(inchar,indoc); 
(*put_ctr)(']',ctrfp); (*applic)(DATA_STG,"]",""); } else if (inchar == '<') if ((inchar=our_fgetc(indoc)) == '/') if (isalpha(inchar=our_fgetc(indoc))) { more_subdata = pcdata_ft = FALSE; our_ungetc(inchar,indoc); unget_string("</"); } else { retval = FOUND; check_cr(&num_cr,cr_found,&pcdata_ft,FALSE); (*print_ctr)(ctrfp,"</"); (*applic)(DATA_STG,"</",""); our_ungetc(inchar,indoc); } else /* found markup */ if (inchar=='?' || isalpha(inchar)) { more_subdata = pcdata_ft = FALSE; our_ungetc(inchar,indoc); our_ungetc('<',indoc); } else if (inchar == '!') retval = check_for_mdo(&more_subdata,&num_cr,cr_found,&pcdata_ft); else { retval = FOUND; check_cr(&num_cr,cr_found,&pcdata_ft,FALSE); (*put_ctr)('<',ctrfp); (*applic)(DATA_STG,"<",""); our_ungetc(inchar,indoc); } else { retval = FOUND; check_cr(&num_cr,cr_found,&pcdata_ft,FALSE); try_entref(inchar,genthead,FALSE,&pcdata_ft); } } check_cr(&num_cr,cr_found,&pcdata_ft,TRUE); } else if (find_except(currincl,token) && !find_except(currexcl,token)) { newcm = pushcreate(tp); if ((retval = traverse(newcm,tp,genthead,penthead,&pcdata_ft)) == NFSH) { if (tknretval == TEXT) sprintf(error_msg,"%s%s%s","\nError: Invalid data, last opened element '",tp->nametoken,"'.\n"); else sprintf(error_msg,"%s%s%s","\nError: Invalid tag, last opened element '",tp->nametoken,"'.\n"); FATAL_ERROR() } /* check to make sure the element has content */ if (EMPTY_CONTENT(newcm)) { /* can't have endtag for EMPTY */ token2 = token | HIGHBIT; putstr_outbuf("\n[/"); putstr_outbuf(tp->nametoken); putstr_outbuf("]"); place_in_queue(END_TAG_NAME,tp->nametoken,""); } else tknretval = gettoken(&tp,&token2,genthead,penthead,&pcdata_ft); /* must be end tag */ if (IS_STARTTAG(token2) || IS_ENDTAG_NOTEQ(token2,token)) resolve_endtag(tp->cmptr,token2,tp,&retval,tknretval,genthead,penthead,token); popfree(newcm); /* through with this content model */ } else { more_pcdata = FALSE; ungettoken(token,tp); } if (more_pcdata) tknretval = 
gettoken(&tp,&token,genthead,penthead,&pcdata_ft); } if (retval==FOUND && !cr_found) (*put_ctr)('|',ctrfp); free(outstr); return(retval); } /*------------------------------------------------------*/ /* G E T R C D A T A */ /* Reads 'indoc' for replaceable character data. */ /* Entity references are resolved normally. */ /* RCDATA is terminated by an etago, delimiter */ /* in context. That is a '</' followed by a name */ /* start character. */ /* */ /* returns -- NFDHT, FOUND */ /*------------------------------------------------------*/ STATUS getrcdata(genthead,look_for_endtag,firsttime,end_of_data) ENTITYDESC *genthead; BOOLEAN look_for_endtag,*firsttime,end_of_data; { int inchar, prev_entitylevel; unsigned num_cr; BOOLEAN more_rcdata, same_entity, cr_found, rcdata_ft; STATUS retval; more_rcdata = same_entity = TRUE; flush_buf(); retval = NFDHT; num_cr = 0; rcdata_ft = *firsttime; while(more_rcdata && (inchar=our_fgetc(indoc))!=EOF) { cr_found = save_crs(&num_cr,&inchar); if (inchar=='<' && look_for_endtag) if ((inchar=our_fgetc(indoc)) == '/') if (isalpha(inchar=our_fgetc(indoc)) && same_entity) { more_rcdata = FALSE; our_ungetc(inchar,indoc); /* unget in reverse order */ unget_string("</"); } else { retval = FOUND; check_cr(&num_cr,cr_found,&rcdata_ft,FALSE); (*print_ctr)(ctrfp,"</"); (*applic)(DATA_STG,"</",""); our_ungetc(inchar,indoc); } else { retval = FOUND; check_cr(&num_cr,cr_found,&rcdata_ft,FALSE); (*put_ctr)('<',ctrfp); (*applic)(DATA_STG,"<",""); our_ungetc(inchar,indoc); } else if (inchar==']' && !look_for_endtag) if ((inchar=our_fgetc(indoc)) == ']') if ((inchar=our_fgetc(indoc))==MARKUP_END && same_entity) { more_rcdata = open_rcdata_ms = FALSE; unget_string("]]>"); } else { retval = FOUND; check_cr(&num_cr,cr_found,&rcdata_ft,FALSE); (*print_ctr)(ctrfp,"]]"); (*applic)(DATA_STG,"]]",""); our_ungetc(inchar,indoc); } else { retval = FOUND; check_cr(&num_cr,cr_found,&rcdata_ft,FALSE); (*print_ctr)(ctrfp,"]"); (*applic)(DATA_STG,"]",""); 
our_ungetc(inchar,indoc); } else if (inchar == OUR_EE) { if (entstack[--entitylevel] != lookstack()) ourexit(2,"\nError: Entity End occurred in different replaceable character data.\n"); check_cr(&num_cr,cr_found,&rcdata_ft,FALSE); same_entity = TRUE; } else { retval = FOUND; check_cr(&num_cr,cr_found,&rcdata_ft,FALSE); prev_entitylevel = entitylevel; try_entref(inchar,genthead,FALSE,&rcdata_ft); same_entity = (prev_entitylevel == entitylevel) ? TRUE : FALSE; } } if (retval==FOUND && end_of_data) (*put_ctr)('|',ctrfp); return(retval); } /*------------------------------------------------------*/ /* G E T _ M A R K E D _ S E C T I O N */ /* This routine processes a marked section. If */ /* the section is an INCLUDE section, processing */ /* is returned to gettoken, else the entire section */ /* is processed and then control is returned. */ /*------------------------------------------------------*/ void get_marked_section(penthead) ENTITYDESC *penthead; { register int inchar,statkey; int begnum_open; BOOLEAN moredata,close_read=FALSE; if ((inchar=our_fgetc(indoc)) != '[') ourexit(2,"\nError: DSO not found in marked section.\n"); statkey = get_status_keyword(penthead); while(inputps(penthead) > 0) gettilnosep(); if ((inchar=our_fgetc(indoc)) != '[') ourexit(2,"\nError: DSO not found in marked section.\n"); switch(statkey) { case MS_INCLUDE: break; case MS_CDATA: open_cdata_ms = TRUE; break; case MS_RCDATA: open_rcdata_ms = TRUE; break; case MS_IGNORE: begnum_open = num_open_ms-1; /* already incremented */ moredata = TRUE; while(moredata && (inchar=our_fgetc(indoc))!=EOF) if (inchar=='<' && (inchar=our_fgetc(indoc))=='!' 
&& (inchar=our_fgetc(indoc))=='[') { if (++num_open_ms > TAGLVL) ourexit(2,"\nError: Number open marked sections > TAGLVL.\n"); } else if (inchar==']' && (inchar=our_fgetc(indoc))==']' && (inchar=our_fgetc(indoc))==MARKUP_END) { if (--num_open_ms == begnum_open) moredata = FALSE; } else if (inchar == OUR_EE) ourexit(2,"\nError: Entity End found in IGNORE marked section.\n"); STRIP_CRs(); close_read = TRUE; break; default: software_fault(); break; } if (!close_read && statkey!=MS_INCLUDE && statkey!=MS_CDATA && statkey!=MS_RCDATA) { if ((inchar=our_fgetc(indoc))!=']' || (inchar=our_fgetc(indoc))!=']') ourexit(2,"\nError: MDO not found in marked section.\n"); if ((inchar=our_fgetc(indoc)) != MARKUP_END) ourexit(2,"\nError: MDC not found in marked section.\n"); } return; } /*------------------------------------------------------*/ /* G E T _ N A M E */ /* Reads from the input document for a valid */ /* SGML name. An error condition is raised if */ /* the length of the name is greater than NAMELEN. 
*/ /*------------------------------------------------------*/ get_name(name,capitalize) char name[]; int (*capitalize)(); { int inchar,indx; memset(name,'\0',NAMELEN+1); indx=0; if (isalpha(inchar=our_fgetc(indoc))) { putchar_outbuf(name[indx++]=(*capitalize)(inchar)); fillup(name,&indx,capitalize); if (indx > NAMELEN) { sprintf(error_msg,"%s%s%s","\nError: Length of name beginning '",name,"' > NAMELEN\n"); FATAL_ERROR() } } else { name[indx++] = (*capitalize)(inchar); fillup(name,&indx,capitalize); sprintf(error_msg,"%s%s%s","\nError: Name '",name,"' must start with name start character\n"); FATAL_ERROR() } return(indx); } /*------------------------------------------------------*/ /*------------------------------------------------------*/ /*------------------------------------------------------*/ get_entname(name,capitalize) char name[]; int (*capitalize)(); { int inchar,indx; memset(name,'\0',NAMELEN+1); indx=0; if (isalpha(inchar=our_fgetc(indoc))) { name[indx++] = (*capitalize)(inchar); fillup2(name,&indx,capitalize); if (indx > NAMELEN) { sprintf(error_msg,"%s%s%s","\nError: Length of name beginning '",name,"' > NAMELEN\n"); FATAL_ERROR() } } else { name[indx++] = (*capitalize)(inchar); fillup2(name,&indx,capitalize); sprintf(error_msg,"%s%s%s","\nError: Name '",name,"' must start with name start character\n"); FATAL_ERROR() } return(indx); } /*------------------------------------------------------*/ /* G E T _ N U T O K E N */ /* Reads from the input document for a valid */ /* SGML nutoken. An error condition is raised */ /* if the length of the nutoken is greater than */ /* NAMELEN. 
*/ /*------------------------------------------------------*/ get_nutoken(nutoken,capitalize) char nutoken[]; int (*capitalize)(); { int inchar,indx; memset(nutoken,'\0',NAMELEN+1); indx=0; inchar=our_fgetc(indoc); if (isdigit(inchar)) { /* nutoken must start with numeral */ putchar_outbuf(nutoken[indx++]=(*capitalize)(inchar)); fillup(nutoken,&indx,capitalize); if (indx > NAMELEN) { sprintf(error_msg,"%s%s%s","\nError: Length of nutoken beginning '",nutoken,"' > NAMELEN\n"); FATAL_ERROR() } } else { nutoken[indx++] = (*capitalize)(inchar); fillup(nutoken,&indx,capitalize); sprintf(error_msg,"%s%s%s","\nError: Nutoken '",nutoken,"' must start with numeral.\n"); FATAL_ERROR() } return(indx); } /*------------------------------------------------------*/ /* G E T _ N M T O K E N */ /* Reads from the input document for a valid */ /* SGML nmtoken. An error condition is raised */ /* if the length of the nmtoken is greater than */ /* NAMELEN. */ /*------------------------------------------------------*/ get_nmtoken(nmtoken,capitalize) char nmtoken[]; int (*capitalize)(); { int indx=0; memset(nmtoken,'\0',NAMELEN+1); fillup(nmtoken,&indx,capitalize); if (indx > NAMELEN) { sprintf(error_msg,"%s%s%s","\nError: Length of nmtoken beginning '",nmtoken,"' > NAMELEN\n"); FATAL_ERROR() } return(indx); } /*------------------------------------------------------*/ /* G E T _ N U M B E R */ /* Reads from the input document for a valid */ /* SGML number. An error condition is raised */ /* if the length of the number is greater than */ /* NAMELEN. 
*/ /*------------------------------------------------------*/ get_number(number,capitalize) char number[]; int (*capitalize)(); { int indx=0; memset(number,'\0',NAMELEN+1); while(isdigit(number[indx]=our_fgetc(indoc)) && indx<=NAMELEN) putchar_outbuf(number[indx++]); if (indx > NAMELEN) { sprintf(error_msg,"%s%s%s","\nError: Length of number beginning '",number,"' > NAMELEN.\n"); FATAL_ERROR() } if (indx == 0) { sprintf(error_msg,"%s%s%s","\nError: Invalid number, found '",number,"'.\n"); FATAL_ERROR() } our_ungetc(number[indx],indoc); return(indx); } /*------------------------------------------------------*/ /* G E T _ P I */ /* This routine reads a processing instruction */ /* from 'indoc'. No parsing is done on the p.i. */ /* The p.i. is terminated by a TAGC. The output */ /* buffer has already been flushed, therefore */ /* the p.i. is dumped directly into 'outdoc'. */ /*------------------------------------------------------*/ void get_pi() { register int inchar,pi_length; char outpi[PILEN+1]; pi_length = 0; while((inchar=our_fgetc(indoc))!=MARKUP_END && inchar!=EOF && PILEN>pi_length) { if (inchar == OUR_EE) ourexit(2,"\nError: EE is invalid in processing instruction.\n"); else (*put_ctr)(inchar,ctrfp); outpi[pi_length++] = inchar; } (*put_ctr)(']',ctrfp); STRIP_CRs(); if (pi_length > PILEN) ourexit(2,"\nError: Length of processing instruction > PILEN.\n"); outpi[pi_length] = '\0'; (*applic)(PROC_INST,outpi,""); return; } /*------------------------------------------------------*/ /* G E T _ S T A T U S _ K E Y W O R D */ /* This routine will parse the status keyword */ /* specification of a marked secttion declaration. */ /* Zero or more status keywords are allowed in */ /* the specification. If none is specified, the */ /* default of INCLUDE is returned. 
If multiple */
/*      keywords are given, the one with the highest    */
/*      priority wins (highest shown first):            */
/*                        "IGNORE"                      */
/*                        "CDATA"                       */
/*                        "RCDATA"                      */
/*                        "INCLUDE"                     */
/*      The priority is obtained with MAX() on the MS_  */
/*      codes.  "TEMP" is accepted and has no effect;   */
/*      any other word is an error.  The '[' that ends  */
/*      the keyword list is pushed back for the caller. */
/*------------------------------------------------------*/
get_status_keyword(penthead)
ENTITYDESC *penthead;
{
   char word[NAMELEN+1];
   int status;
   int ch;

   status = MS_INCLUDE;   /* if none are specified, INCLUDE is assumed */
   for (;;) {
      /* skip whatever inputps() and gettilnosep() consume before the next
         keyword or the closing '[' */
      while (inputps(penthead) > 0)
         ;
      gettilnosep();

      ch = our_fgetc(indoc);
      if (ch == '[')
         break;
      our_ungetc(ch,indoc);

      get_entname(word,our_toupper);
      if (!strcmp(word,"IGNORE"))
         status = MAX(MS_IGNORE,status);
      else if (!strcmp(word,"CDATA"))
         status = MAX(MS_CDATA,status);
      else if (!strcmp(word,"RCDATA"))
         status = MAX(MS_RCDATA,status);
      else if (!strcmp(word,"INCLUDE"))
         status = MAX(MS_INCLUDE,status);
      else if (strcmp(word,"TEMP"))
         ourexit(2,"\nError: Illegal status keyword in marked section\n");
   }
   our_ungetc(ch,indoc);
   return(status);
}

/*--------------------------------------------------------------*/
/*   G E T T O K E N                                            */
/*      This routine attempts at all costs to get a tag from    */
/*      the document.  If a tag has already been read and       */
/*      "ungettoken"d, then that tag is returned.  If not,      */
/*      then parsing continues eating up all comments and       */
/*      processing instructions.  Marked sections are opened    */
/*      and processed as far as possible, meaning until data    */
/*      is found.
*/ /*--------------------------------------------------------------*/ TKNRETVAL gettoken(tp,token,genthead,penthead,get_ft) int *token; STPTR *tp; ENTITYDESC *genthead,*penthead; BOOLEAN *get_ft; { char genid[NAMELEN+1]; /* generic identifier read from indoc */ int curr_delim, /* current delimiter working with */ inchar, /* current input character */ open_token; unsigned nleng_spec_list, /* normalized length of specification list */ num_id_idref; /* number of ID and IDREF attribute values */ STENTRY *opened_tp; TKNRETVAL retval; /* either MARKUP_FOUND or TEXT */ nleng_spec_list = num_id_idref = 0; curr_delim = PIO; if (state == GETNEW) { while(curr_delim==PIO || (curr_delim==MDO && !open_cdata_ms && !open_rcdata_ms)) { /* get input from input document */ flush_buf(); open_token = ((opened_tp=lookstack()) == NULL) ? rootid : opened_tp->tokenid; if (num_open_ms > 0) get_ms_closes(); if (symtable[open_token].content_type==ELEMENT_CONTENT && !open_cdata_ms && !open_rcdata_ms) { while ((inchar=our_fgetc(indoc))=='&' || inchar==RE || inchar==RS || inchar==SEPCHAR || inchar==SPACE || inchar==OUR_EE) { try_entref(inchar,genthead,TRUE,&dontcare); gettilnosep(); } our_ungetc(inchar,indoc); } if (num_open_ms > 0) get_ms_closes(); switch(curr_delim=getdelim()) { case NODELIM: case EOF: retval = TEXT; /* if find EOF, just assume it was TEXT */ break; case PIO: retval = TEXT; /* just an assumption */ flush_buf(); /* flush delimiter out */ *get_ft = TRUE; get_pi(); break; case MDO: retval = TEXT; /* just an assumption */ inchar = our_fgetc(indoc); our_ungetc(inchar,indoc); if (inchar == MARKUP_END) { /* null comment */ CLEAR_BUF(); /* clear out MDO */ if ((inchar=our_fgetc(indoc)) != MARKUP_END) ourexit(2,"\nError: MDO not found for comment declaration\n"); STRIP_CRs(); } else if (inchar == '-') { /* regular comment */ CLEAR_BUF(); /* clear out MDO */ while(inputps(penthead) > 0); if ((inchar=our_fgetc(indoc)) != MARKUP_END) ourexit(2,"\nError: MDO not found for comment 
declaration\n"); STRIP_CRs(); } else { CLEAR_BUF(); /* flush delimiter out */ if (++num_open_ms > TAGLVL) ourexit(2,"\nError: Number of open marked sections > TAGLVL\n"); get_marked_section(penthead); } break; case ETAGO: get_name(genid,our_toupper); sprintf(lastread_tag,"</%s>",genid); place_in_queue(END_TAG_NAME,genid,""); retval = MARKUP_FOUND; if ((*tp=(STPTR)bsearch(genid,symtable,numsym,sizeof(STENTRY),compare)) != NULL) *token = (*tp)->tokenid; else { sprintf(error_msg,"%s%s%s","\nError: Unknown generic identifier '",genid,"' in endtag.\n"); FATAL_ERROR() } *token |= HIGHBIT; /* turn high bit on for end tags */ gettilnosep(); if ((inchar=our_fgetc(indoc)) != MARKUP_END) { sprintf(error_msg,"%s%s%s","\nError: TAGC not found for '",genid,"'.\n"); FATAL_ERROR() } putchar_outbuf(']'); /* TAGC to buffer */ STRIP_CRs(); *get_ft = TRUE; break; case STAGO: retval = get_starttag(token,tp,genthead,get_ft,&nleng_spec_list,&num_id_idref); break; default: software_fault(); } /*switch*/ } /*while*/ } else { /* get input from intermediate source, i.e. after ungettoken */ state = GETNEW; /* next time get from document */ *token = holdtoken; *tp = holdtp; retval = MARKUP_FOUND; } if (nleng_spec_list > ATTSPLEN) ourexit(2,"\nError: Normalized length of attribute spec list > ATTSPLEN\n"); if (num_id_idref > GRPCNT) ourexit(2,"\nError: Total number of id reference names > GRPCNT.\n"); return(retval); } /*--------------------------------------------------------------*/ /* G E T _ S T A R T T A G */ /* This routine handles the processing of a start tag. */ /* First the name of the tag is read and then a search */ /* is made to ensure that the name is a valid generic */ /* identifier. The attributes and their values are */ /* then read in and verified one at a time. 
*/ /*--------------------------------------------------------------*/ TKNRETVAL get_starttag(token,tp,genthead,get_ft,nleng_spec_list,num_id_idref) int *token; STPTR *tp; ENTITYDESC *genthead; BOOLEAN *get_ft; unsigned *nleng_spec_list,*num_id_idref; { char genid[NAMELEN+1], /* generic identifier read from indoc */ attrname[NAMELEN+1]; /* name of attribute value */ int inchar, /* current input character */ leng, /* length of attribute name */ temp_bufptr, tagsize; /* current length of tag */ ATTRDESC *thisadp; /* points to description of attribute */ BOOLEAN notat_specified; TKNRETVAL retval; /* either MARKUP_FOUND or TEXT */ notat_specified = FALSE; tagsize = get_name(genid,our_toupper); sprintf(lastread_tag,"<%s>",genid); place_in_queue(TAG_NAME,genid,""); retval = MARKUP_FOUND; if ((*tp=(STPTR)bsearch(genid,symtable,numsym,sizeof(STENTRY),compare)) != NULL) *token = (*tp)->tokenid; else { sprintf(error_msg,"%s%s%s","\nError: Unknown generic identifier '",genid,"'.\n"); FATAL_ERROR() } (*tp)->cmptr->contref_attr = FALSE; unprocess((*tp)->adptr); tagsize += gettilnosep(); temp_bufptr = bufptr; while((inchar=our_fgetc(indoc)) != MARKUP_END) { our_ungetc(inchar,indoc); putchar_outbuf(' '); leng = get_name(attrname,our_toupper); *nleng_spec_list += leng+NORMSEP; tagsize += leng; if ((thisadp=find_attr(attrname,(*tp)->adptr)) == NULL) { sprintf(error_msg,"%s%s%s","\nError: Unknown attribute name'",attrname,"'.\n"); FATAL_ERROR() } else if (thisadp->processed == TRUE) { sprintf(error_msg,"%s%s%s","\nError: Duplicate attribute specifications '",thisadp->attrname,"'.\n"); FATAL_ERROR() } else *nleng_spec_list += get_attrvalue(thisadp,genthead,&tagsize,&((*tp)->cmptr->contref_attr),&(notat_specified)); tagsize += gettilnosep(); } bufptr = temp_bufptr; if (req_not_proc((*tp)->adptr) == TRUE) { sprintf(error_msg,"%s%s%s","\nError: REQUIRED or CURRENT attribute not specified '", (*tp)->adptr->attrname,"'.\n"); FATAL_ERROR() } if (tagsize > TAGLEN) ourexit(3,"\nLength of 
undelimited start tag > TAGLEN.\n"); *num_id_idref += resolve_attr((*tp)->adptr,FALSE); place_in_queue(TAG_END,"",""); if ((*tp)->adptr == NULL) putchar_outbuf(']'); else putstr_outbuf("\n]"); STRIP_CRs(); *get_ft = FALSE; return(retval); } /*------------------------------------------------------*/ /* G E T _ A T T R V A L U E */ /* This routine processes the attribute value part */ /* of an attribute specification. The value is */ /* checked for correctness in terms of syntax as */ /* well as semantics. */ /*------------------------------------------------------*/ get_attrvalue(thisadp,genthead,taglen,contref,notat_specified) ATTRDESC *thisadp; ENTITYDESC *genthead; int *taglen; BOOLEAN *contref,*notat_specified; { char name[NAMELEN+1], buffer[ATTSPLEN+1], idrefname[NAMELEN+1], idname[NAMELEN+1]; unsigned length, nleng_attrval, num_csdata; register int inchar; int delim, val, (*getone)(); BOOLEAN more_attr_vals; GROUPDESC *groupptr; (*taglen) += gettilnosep()+1; length = 0; nleng_attrval = NORMSEP; thisadp->processed = TRUE; if ((inchar=our_fgetc(indoc)) != '=') /* name has already been read */ ourexit(2,"\nError: Invalid value indicator in attribute specification.\n"); putchar_outbuf('='); (*taglen) += gettilnosep() + 1; if ((delim=our_fgetc(indoc))==LITA || delim==LIT) putchar_outbuf(delim); else ourexit(2,"\nError: LIT or LITA not specified in attribute specification\n"); BLANK(buffer,ATTSPLEN+1); switch(thisadp->dvcode) { case NAME: case NAMES: case NOTATION: getone = get_name; break; case NUMBER: case NUMBERS: getone = get_number; break; case NMTOKEN: case NMTOKENS: case GROUP: getone = get_nmtoken; break; case NUTOKEN: case NUTOKENS: getone = get_nutoken; break; } (*taglen) += process_attr(buffer,delim,genthead,thisadp->dvcode,&num_csdata); nleng_attrval += num_csdata*NORMSEP; (*taglen)++; /* close delimiter */ if (thisadp->dvcode != ENUM_CDATA) { unget_string(buffer); gettilnosep(); } if (thisadp->defcode == A_CONREF) { if (*notat_specified == TRUE) 
ourexit(2,"\nError: Content reference attribute not allowed with notation attribute.\n"); *contref = TRUE; } more_attr_vals = TRUE; switch(thisadp->dvcode) { case ENUM_CDATA: putstr_outbuf(buffer); nleng_attrval += strlen(buffer); length = strlen(buffer); get_close(delim); check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length); break; case GROUP: case NOTATION: length = (*getone)(name,our_toupper); /* GROUP is actually a nmtoken */ if (thisadp->dvcode == NOTATION) { if (*contref == TRUE) { sprintf(error_msg,"%s%s%s","\nError: Notation attribute '",name,"' specified after content reference.\n"); FATAL_ERROR() } *notat_specified = TRUE; } nleng_attrval += length+NORMSEP; /* value must have been defined as part of the group */ if ((groupptr=find_group(name,thisadp->groupp)) == NULL) { sprintf(error_msg,"%s'%s'.\n","\nError: Unknown attribute group member ",name); FATAL_ERROR() } else thisadp->u2.currgrp = groupptr; get_close(delim); check_fixed(thisadp->defcode,name,thisadp->u2.currgrp->groupname,NAMELEN); break; case NAME: case NMTOKEN: case NUTOKEN: case NUMBER: length = (*getone)(buffer,our_toupper,FALSE); nleng_attrval += length+NORMSEP; get_close(delim); check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(thisadp->u2.currdef)); break; case NUMBERS: case NAMES: case NMTOKENS: case NUTOKENS: while(more_attr_vals) { /* process each attribute value */ /* in list, one at a time */ val = (*getone)(buffer+length,our_toupper); nleng_attrval += NORMSEP + val; length += val; if (gettilnosep() != 0) length++; more_attr_vals = ((inchar=our_fgetc(indoc)) != delim); if (inchar != delim) { putchar_outbuf(' '); our_ungetc(inchar,indoc); } } putchar_outbuf(inchar); check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length); break; case ID: length = get_name(buffer,our_toupper); nleng_attrval += length+NORMSEP; strcpy(idname,buffer); get_close(delim); check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(thisadp->u2.currdef)); break; case IDREF: 
length = get_name(buffer,our_toupper); nleng_attrval += length+NORMSEP; strcpy(idrefname,buffer); get_close(delim); check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,strlen(thisadp->u2.currdef)); break; case IDREFS: while(more_attr_vals) { /* process each attribute value */ /* of the list one at a time */ val = get_name(buffer+length,our_toupper); strncpy(idrefname,buffer+length,val); length += val; nleng_attrval += NORMSEP+val; if (gettilnosep() != 0) length++; more_attr_vals = ((inchar=our_fgetc(indoc)) != delim); if (inchar != delim) { putchar_outbuf(' '); our_ungetc(inchar,indoc); } } putchar_outbuf(inchar); check_fixed(thisadp->defcode,buffer,thisadp->u2.currdef,length); break; case ENTITY: length = get_name(name,nullfnc); nleng_attrval += length+NORMSEP; if (find_entity(genthead,name,FALSE) == NULL) { sprintf(error_msg,"%s'%s'.\n","\nError: Unknown attribute general entity name ",name); FATAL_ERROR() } check_fixed(thisadp->defcode,name,thisadp->u2.currdef,strlen(thisadp->u2.currdef)); get_close(delim); break; default: software_fault(); break; } if (nleng_attrval > LITLEN) ourexit(2,"\nError: Normalized length of attribute value > LITLEN\n"); if (thisadp->dvcode!=NOTATION && thisadp->dvcode!=GROUP) { if (thisadp->u2.currdef != NULL) free(thisadp->u2.currdef); thisadp->u2.currdef = get_char_mem(length+1); buffer[length] = '\0'; strcpy(thisadp->u2.currdef,buffer); } return(nleng_attrval); } /*------------------------------------------------------*/ /* G E T _ C D A T A _ M S */ /*------------------------------------------------------*/ STATUS get_cdata_ms(firsttime) BOOLEAN *firsttime; { BOOLEAN moredata,cr_found,cdata_ms_ft; STATUS retval; int inchar; unsigned num_cr; char *outstr; flush_buf(); retval = NFDHT; cdata_ms_ft = *firsttime; outstr = get_char_mem(2); moredata = TRUE; while(moredata && (inchar=our_fgetc(indoc))!=EOF) { cr_found = save_crs(&num_cr,&inchar); if (inchar == ']') if ((inchar=our_fgetc(indoc)) == ']') if ((inchar=our_fgetc(indoc)) == 
MARKUP_END) { moredata = FALSE; our_ungetc(MARKUP_END,indoc); unget_string("]]"); } else { retval = FOUND; check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE); (*print_ctr)(ctrfp,"]]%c",inchar); (*applic)(DATA_STG,"]]",""); *outstr = inchar; (*applic)(DATA_STG,outstr,""); } else { retval = FOUND; check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE); (*print_ctr)(ctrfp,"]%c",inchar); (*applic)(DATA_STG,"]",""); *outstr = inchar; (*applic)(DATA_STG,outstr,""); } else { retval = FOUND; check_cr(&num_cr,cr_found,&cdata_ms_ft,FALSE); (*put_ctr)(inchar,ctrfp); *outstr = inchar; (*applic)(DATA_STG,outstr,""); } } open_cdata_ms = FALSE; free(outstr); return(retval); } /*------------------------------------------------------*/ /* G E T _ C L O S E */ /* This routine reads from 'indoc' for the */ /* delimeter passed to it as a parameter. If */ /* the delimeter is not found, an error is raised. */ /*------------------------------------------------------*/ void get_close(delim) int delim; { int inchar; if ((inchar=our_fgetc(indoc)) != delim) ourexit(2,"\nError: Lit or lita delimeter not found in attribute literal.\n"); else putchar_outbuf(inchar); return; } /*------------------------------------------------------*/ /* G E T T I L N O S E P */ /* This routine reads from the file until a */ /* non-seperator is found. */ /*------------------------------------------------------*/ gettilnosep() { register int indx; int inchar; indx = 0; /* notice we aren't writing unneeded seperators to output file */ inchar=our_fgetc(indoc); /* get character from file */ while(SEPERATOR(inchar)) { inchar=our_fgetc(indoc); indx++; } our_ungetc(inchar,indoc); return(indx); } /*------------------------------------------------------*/ /* G E T _ C H A R _ M E M */ /* This routine allocates memory for character */ /* data and raises an error condition if there */ /* is insufficient memory for the allocation. 
*/ /*------------------------------------------------------*/ char *get_char_mem(number) int number; { char *retptr,*calloc(); if ((retptr=calloc(number,sizeof(char))) == NULL) ourexit(2,"\nInsufficient memory in parse3\n"); return(retptr); } /*------------------------------------------------------*/ /* G E T _ M S _ C L O S E S */ /* This routine reads from 'indoc' as many */ /* marked section closes as possible. */ /*------------------------------------------------------*/ void get_ms_closes() { int inchar,open_token; STENTRY *opened_tp; BOOLEAN more_ms_closes=TRUE; while(more_ms_closes && (inchar=our_fgetc(indoc))!=EOF) { if (inchar == ']') if ((inchar=our_fgetc(indoc)) == ']') if ((inchar=our_fgetc(indoc)) == MARKUP_END) { if (--num_open_ms == 0) more_ms_closes = FALSE; if ((inchar=our_fgetc(indoc)) != OUR_EE) our_ungetc(inchar,indoc); } else { our_ungetc(inchar,indoc); unget_string("]]"); more_ms_closes = FALSE; } else { our_ungetc(inchar,indoc); our_ungetc(']',indoc); more_ms_closes = FALSE; } else { our_ungetc(inchar,indoc); more_ms_closes = FALSE; } open_token = ((opened_tp=lookstack()) == NULL) ? rootid : opened_tp->tokenid; if (symtable[open_token].content_type == ELEMENT_CONTENT) gettilnosep(); /* seperators are allowed between tags */ } return; }